#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: Process data on mine locations for falsification tests

#load packages
library(tidyverse)
library(tidylog)
library(janitor)
library(maps)
library(snakecase)
library(here)

#load custom script for reading MSHA data
source(here("code", "fun", "read_mines.r"))

# Load MSHA data on employment at mines, which can be used to identify locations
jobs <- read_delim(here("data", "input", "msha", "210528-msha-MinesProdYearly.txt"), "|", escape_double = FALSE, trim_ws = TRUE)

# Clean names
jobs <- clean_names(jobs)
# Remove empty rows
jobs <- remove_empty(jobs)
# Create unique list of MSHA mines
jobs_list <- jobs %>%
  dplyr::select(mine_id, calendar_yr) %>%
  unique()
# Create complete list of msha_id-year 
mine_complete <- jobs_list %>% expand(mine_id, calendar_yr)
# Identify start and stop years of the mine
start_stop <- jobs_list %>%
  group_by(mine_id) %>%
  summarise(min_year = min(calendar_yr),
            max_year = max(calendar_yr))
# Merge complete data with start and stop years
all_mines <- left_join(mine_complete, start_stop, by = "mine_id")
# Remove years from complete data frame when the mine was not in operation
all_mines <- subset(all_mines, calendar_yr >= min_year)
# Create indicator for if a mine closed in a year
all_mines$closed <- ifelse(all_mines$max_year < all_mines$calendar_yr, 1, 0)
# Re-merge with jobs data
jobs_clean <- jobs %>%
 group_by(mine_id, calendar_yr) %>%
 summarise(
    across(
      c(annual_hrs, annual_coal_prod, avg_annual_empl, avg_employee_hours),
      ~ sum(.x, na.rm = TRUE)
    )
  ) %>%
  left_join(all_mines, ., by = c("mine_id", "calendar_yr"))
# Convert mine ID to integer
jobs_clean$mine_id <- as.integer(jobs_clean$mine_id)

#Load mine-level data on coal production
# https://www.eia.gov/coal/data.php
coal_mine_list <- list.files(path = here("data", "input", "eiacoal_minelevel"), pattern = ".csv", full.names = TRUE)
coal_mine_raw <- lapply(coal_mine_list, read_mines)
mine_prod_raw <- do.call("rbind", coal_mine_raw)
# Clean names
mine_prod <- clean_names(mine_prod_raw)
# Convert year to numeric
mine_prod$year <- as.numeric(mine_prod$year)
# Remove empty rows and columns
mine_prod <- remove_empty(mine_prod)
# Drop columns
mine_prod$state <- NULL
basin <- subset(mine_prod, select = c(msha_id, basin))
basin <- unique(basin)
mine_prod$basin <- NULL
# Pivot from long to wide
mine_prod_wide <- mine_prod %>%
  pivot_wider(
    id_cols = c(msha_id, year),
    names_from = c(coal_rank),
    values_from = c(total_production),
    values_fill = 0
  ) %>%
  clean_names() %>%
  dplyr::select(-contains("prp"))
#Rename
names(mine_prod_wide) <- gsub("total_production_", "", names(mine_prod_wide))
#create indicator for coal
coal <- mine_prod$msha_id %>%
  data.frame("msha_id" = .) %>%
  unique() %>%
  mutate(coal = 1)

# Mine List from MSHA
# https://www.msha.gov/mine-data-retrieval-system
mines <- read_delim(here("data", "input", "msha", "210527-msha-mines.txt"), "|", escape_double = FALSE, trim_ws = TRUE)
# Fix names and remove empty rows and columns
mines <- mines %>%
  clean_names(.) %>%
  remove_empty(.)
#load state abbreviation to fips codes
statefipsabs <- read.csv("https://gist.githubusercontent.com/dantonnoriega/bf1acd2290e15b91e6710b6fd3be0a53/raw/11d15233327c8080c9646c7e1f23052659db251d/us-state-ansi-fips.csv")
statefipsabs$stusps <- trimws(statefipsabs$stusps, "both")
mines <- merge(mines, statefipsabs, by.x = "state", by.y = "stusps", all.x = TRUE)
# Create MSHA to FIPS crosswalk
mines$fips <- paste0(mines$st, mines$fips_cnty_cd)
mines <- subset(mines, select = c(mine_id, fips, primary_sic))
mines <- unique(mines)
mines$mine_id <- as.numeric(mines$mine_id)
# Merge
mine <- left_join(jobs_clean, mine_prod_wide, by = c("mine_id" = "msha_id", "calendar_yr" = "year")) %>%
  left_join(., coal, by = c("mine_id" = "msha_id")) %>%
  left_join(., mines, by = c("mine_id"))
#Indicator for coal mines
mine$coal <- with(mine, ifelse(is.na(coal), 0, 1))
#Save mines
saveRDS(mine, here("data", "inter", "coalmines.rds"))

# Drop non-coal mines
noncoal <- subset(mine, coal == 0)

#process and save save non-coal mine output
# Rename year variable
names(noncoal)[2] <- "year"
# Recode all NAs (except for 2001)
noncoal <- mutate(noncoal, across(closed:coal, ~ ifelse(year != 2000 & is.na(.x), 0, .x)))
# fix fips codes
noncoal$fips <- as.numeric(noncoal$fips)
#Aggregate
noncoal_agg <- noncoal %>%
  group_by(fips, year, primary_sic) %>%
  summarise(
    across(closed:avg_employee_hours, ~ sum(.x, na.rm = TRUE)),
    noncoalmine = 1
  )
noncoal_agg$primary_sic <- to_lower_camel_case(noncoal_agg$primary_sic)
noncoal_agg$annual_coal_prod <- NULL
#pivot wider
noncoal_agg_wide <- noncoal_agg %>%
  pivot_wider(id_cols = c(fips, year), names_from = primary_sic, values_from = c(noncoalmine, closed, annual_hrs, avg_annual_empl, avg_employee_hours))
#Subset to falsification tests
noncoal_agg_wide <- subset(noncoal_agg_wide, select = c(year,fips,noncoalmine_ironOre,noncoalmine_silverOre,noncoalmine_dimensionSandstone))
#Save output
saveRDS(noncoal_agg_wide, here("data", "inter", "noncoalmines.rds"))
